# Installing addional libraries
#!pip install networkx
#!pip install powerlaw
# Importing libraries
import pandas as pd
import networkx as nx
import collections
import statistics as stats
import time
import seaborn as sns
import json
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import powerlaw
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Reading the json file.
# A with-statement closes the file handle; the original bare open() leaked it.
with open("data/internal-references-pdftotext.json") as f:
    data = json.load(f)
# Show how the data is presented
list(data.items())[50:60]
# Length of the data
orig_len = len(data)
print(orig_len)
# Get N elements in the dictionary, because of limited computational resources
n_sample = 50000
random.seed(12)
# random.sample() requires a sequence since Python 3.11 (dict views are no
# longer accepted); materializing the items view as a list keeps the same
# order, so the seeded sample is identical to the old behavior.
data = dict(random.sample(list(data.items()), n_sample))
#data = dict(list(data.items())[:n_sample]) #first elements
#data = data #all the data
print("Working with the "+str(round(100*len(data)/orig_len,2))+"% of the original data")
# Consider only papers with at least 4 references
# (the original comment said "more than 4", but the condition is >= 4)
data = {key: refs for key, refs in data.items() if len(refs) >= 4}
# Show how the data is presented
list(data.items())[0:4]
# Construct the edge lists: one (citing, cited) pair per reference.
# Iterate the reference lists directly instead of indexing with
# `for _ in range(len(...))`; papers with no references simply add no pairs,
# so the explicit `len(...) != 0` guard is unnecessary.
papers_main = []
papers_refs = []
for paper_id, refs in data.items():
    for ref in refs:
        papers_main.append(paper_id)
        papers_refs.append(ref)
# Sanity check: both lists must have one entry per citation pair
print(len(papers_main))
print(len(papers_refs))
# The TOP papers are the cited and the SUB, the ones that cited
# (edges built later run top -> sub, i.e. from cited paper to citing paper)
df = pd.DataFrame({'top':papers_refs, 'sub':papers_main})
# Sanity check
print(len(df))
print(df.shape)
df.head()
# Remove duplicates rows
# NOTE(review): keep=False drops *every* copy of a duplicated row, not just
# the extras — an edge appearing twice disappears entirely. Presumably
# intentional (discard ambiguous duplicate extractions); confirm.
df = df.drop_duplicates(keep=False, inplace=False)
print(df.shape)
df.head()
# Consider only the TOP papers that cite other papers, not those that do not cite any other paper
# (i.e. keep edges whose cited paper also appears somewhere as a citing paper).
# Using a set plus the vectorized Series.isin replaces the original Python
# loop with `in <list>` membership tests, which was O(n^2).
intersection = set(df["top"]).intersection(df["sub"])
df = df[df["top"].isin(intersection)].reset_index(drop=True)
print(df.shape)
df.head()
# Saving for later use in Cytoscape
df.to_csv("cytoscape/citations_network.csv", sep='\t', encoding='utf-8', index=False)
# Build the directed citation graph: edges run top (cited) -> sub (citing)
GG = nx.from_pandas_edgelist(df, source="top",
target="sub", edge_attr=None,
create_using=nx.DiGraph())
# NOTE(review): nx.info() was removed in networkx 3.0 — on newer versions use print(GG)
print(nx.info(GG))
Weakly Connected Components: group together all nodes of the graph such that every node in the group can reach every other node in the group through undirected edges (i.e. ignoring edge direction).
# Restrict the graph to its largest weakly connected component
largest_component = max(nx.weakly_connected_components(GG), key=len)
GG = GG.subgraph(largest_component)
print(nx.info(GG))
Warning: this step takes a long time.
# The direction of the arrow indicates flow of information
# from the cited paper (beginning of the arrow) to the citing paper (end of the arrow)
spring_pos = nx.spring_layout(GG)  # might take a little while
fig, ax = plt.subplots(figsize=(40, 30))
ax.axis('off')
nx.draw_networkx(GG, spring_pos, ax=ax, node_size=10, width=1, with_labels=False)
plt.title("Entire graph - Default node size")
plt.close()
fig
i) Recognizing that is a power-law distribution:
ii) Discrete probability power law-distribution, indexed by the degree value k (k greater than $k_{min})$
$p(k) = \frac{\alpha-1}{k_{min}} \cdot \left(\frac{k}{k_{min}}\right)^{-\alpha}$
Let us take the logarithm on each side. What does that lead to?
For degree k greater than $k_{min}$: $\log(p(k)) = \log\left(\frac{\alpha-1}{k_{min}}\right) - \alpha \cdot \log\left(\frac{k}{k_{min}}\right)$
The above expression is linear in $log(\frac{k}{k_{min}})$.
iii) Analogy with linear function: y = cx + b
# Calculating the in-degree.
# `[d for d in list(set(list(GG.nodes)))]` was redundant: a graph's node view
# already contains unique nodes. Dropping the set() also keeps a stable order.
GGs = list(GG.nodes)
GGs_in_degrees = [GG.in_degree[d] for d in GGs]
# Nodes sorted by (in-degree, node id) descending — the full pair sort makes
# the result independent of the initial node order.
GGs_in_order = [x for y, x in sorted(zip(GGs_in_degrees, GGs), reverse=True)]
# Degrees in descending order (reuse the list already computed above)
GGs_in_degrees_order = sorted(GGs_in_degrees, reverse=True)
# Plotting the in-degree distribution
# degree_count maps degree value -> number of nodes with that in-degree
degree_count = collections.Counter(GGs_in_degrees_order)
deg, cnt = zip(*degree_count.items())
plt.figure(figsize=(20,10))
plt.bar(deg, cnt, width=1, color='b')
plt.xlabel("Node degree size", fontsize=20)
plt.ylabel("Frequency", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title("Entire graph - Node in degree distribution", fontsize=20)
plt.grid(color='gray', linestyle='--', linewidth=1)
# axisbelow=True draws the grid behind the bars
plt.rc('axes', axisbelow=True)
plt.show()
# It should look linear on a log-log scale if the distribution follows a power law
log_deg = np.log(deg)
log_cnt = np.log(cnt)
plt.figure(figsize=(20,10))
plt.plot(log_deg, log_cnt, color='b')
plt.xlabel('Logarithm of node degree size', fontsize=20)
plt.ylabel('Logarithm of frequency', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Title fixed to say "in degree" — this section plots the in-degree
# distribution (the generic copy-pasted title was misleading)
plt.title("Entire graph - Node in degree distribution - Log-log scale", fontsize=20)
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.rc('axes', axisbelow=True)
plt.show()
# Fitting a power law distribution
# Power laws are probability distributions with the form:p(x)∝x−α
# Used for degree distribution and powerlaw test
in_degree_sequence = sorted([d for n, d in GG.in_degree()], reverse=True)
# powerlaw.Fit estimates the exponent alpha and the cutoff xmin (= k_min)
fit = powerlaw.Fit(in_degree_sequence)
print("alpha:", round(fit.power_law.alpha,3))
print("Kmin:", round(fit.power_law.xmin,3))
# Intercept and slope of the fitted straight line on the log-log scale,
# following the formula derived in the markdown cell above
print("Intercept:", round(np.log((fit.power_law.alpha-1)/fit.power_law.xmin),3))
print("Slope:", round(-1*fit.power_law.alpha,3))
# Plotting the fitted a power law distribution
plt.figure(figsize=(8, 5))
fig2 = fit.plot_pdf(color='b', linewidth=2, label='Original')
fig2 = fit.power_law.plot_pdf(color='g', linestyle='--', ax=fig2, label='Fitted')
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.title("Fitting a power law distribution")
plt.rc('axes', axisbelow=True)
plt.legend()
# Likelihood-ratio comparison: R > 0 with small p favors power_law over exponential
R, p = fit.distribution_compare('power_law', 'exponential', normalized_ratio=True)
print("R:", R)
print("p:", p)
Discussion: As can be seen above, the in-degree distribution is right-skewed and has a high ratio of maximum to minimum degree. This suggests that it may be well matched by a power-law distribution, and indeed the procedure above verifies that the in-degree distribution fits a power law very well. The likelihood-ratio test comparison produces a pair (R, p), where R is the normalized log-likelihood ratio and p is the statistical significance of that ratio; the p-value tests whether the sign of R is meaningful. Since we obtained p < 0.05 for the likelihood-ratio test, the sign of R is significant, and it indicates that the power-law model is favored.
# Calculating the out-degree.
# `[d for d in list(set(list(GG.nodes)))]` was redundant: a graph's node view
# already contains unique nodes. Dropping the set() also keeps a stable order.
GGs = list(GG.nodes)
GGs_out_degrees = [GG.out_degree[d] for d in GGs]
# Nodes sorted by (out-degree, node id) descending — the full pair sort makes
# the result independent of the initial node order.
GGs_out_order = [x for y, x in sorted(zip(GGs_out_degrees, GGs), reverse=True)]
# Degrees in descending order (reuse the list already computed above)
GGs_out_degrees_order = sorted(GGs_out_degrees, reverse=True)
# Plotting the out-degree distribution
# degree_count maps degree value -> number of nodes with that out-degree
degree_count = collections.Counter(GGs_out_degrees_order)
deg, cnt = zip(*degree_count.items())
plt.figure(figsize=(20,10))
plt.bar(deg, cnt, width=1, color='b')
plt.xlabel("Node degree size", fontsize=20)
plt.ylabel("Frequency", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title("Entire graph - Node out degree distribution", fontsize=20)
plt.grid(color='gray', linestyle='--', linewidth=1)
# axisbelow=True draws the grid behind the bars
plt.rc('axes', axisbelow=True)
plt.show()
# It should look linear on a log-log scale if the distribution follows a power law
log_deg = np.log(deg)
log_cnt = np.log(cnt)
plt.figure(figsize=(20,10))
plt.plot(log_deg, log_cnt, color='b')
plt.xlabel('Logarithm of node degree size', fontsize=20)
plt.ylabel('Logarithm of frequency', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Title fixed to say "out degree" — this section plots the out-degree
# distribution (the generic copy-pasted title was misleading)
plt.title("Entire graph - Node out degree distribution - Log-log scale", fontsize=20)
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.rc('axes', axisbelow=True)
plt.show()
# Fitting a power law distribution
# Power laws are probability distributions with the form:p(x)∝x−α
# Used for degree distribution and powerlaw test
out_degree_sequence = sorted([d for n, d in GG.out_degree()], reverse=True)
# powerlaw.Fit estimates the exponent alpha and the cutoff xmin (= k_min)
fit = powerlaw.Fit(out_degree_sequence)
print("alpha:", round(fit.power_law.alpha,3))
print("Kmin:", round(fit.power_law.xmin,3))
# Intercept and slope of the fitted straight line on the log-log scale,
# following the formula derived in the markdown cell above
print("Intercept:", round(np.log((fit.power_law.alpha-1)/fit.power_law.xmin),3))
print("Slope:", round(-1*fit.power_law.alpha,3))
# Plotting the fitted a power law distribution
plt.figure(figsize=(8, 5))
fig2 = fit.plot_pdf(color='b', linewidth=2, label='Original')
fig2 = fit.power_law.plot_pdf(color='g', linestyle='--', ax=fig2, label='Fitted')
plt.grid(color='gray', linestyle='--', linewidth=1)
plt.title("Fitting a power law distribution")
plt.rc('axes', axisbelow=True)
plt.legend()
# Likelihood-ratio comparison: R > 0 with small p favors power_law over exponential
R, p = fit.distribution_compare('power_law', 'exponential', normalized_ratio=True)
print("R:", R)
print("p:", p)
Discussion: As can be seen above, the out-degree distribution is right-skewed and has a high ratio of maximum to minimum degree. This suggests that it may be well matched by a power-law distribution, and indeed the procedure above verifies that the out-degree distribution fits a power law very well. The likelihood-ratio test comparison produces a pair (R, p), where R is the normalized log-likelihood ratio and p is the statistical significance of that ratio; the p-value tests whether the sign of R is meaningful. Since we obtained p < 0.05 for the likelihood-ratio test, the sign of R is significant, and it indicates that the power-law model is favored.
# Read the ids and titles from "data/oai-arxiv-metadata-hash-abstracts-2019-03-01.json"
#filename = "data/oai-arxiv-metadata-hash-abstracts-2019-03-01.json"
#with open(filename) as f:
# info_ = [json.loads(line) for line in f]
#info_aXv = pd.DataFrame.from_dict(info_)
#info_aXv = pd.DataFrame({'idAxv':info_aXv["id"], 'title':info_aXv["title"]})
#info_aXv["idAxv"] = info_aXv["idAxv"].apply(lambda x: "x"+str(x))
#info_aXv["title"] = info_aXv["title"].apply(lambda x: x.translate({ord("\n"):None}))
#info_aXv.to_csv("data/infos_idd_names.csv", sep='\t', encoding='utf-8', index=False) # save for future use
# Read the ids and titles from "data/infos_idd_names.csv"
filename = "data/infos_idd_names.csv"
info_aXv = pd.read_csv(filename, sep='\t', index_col=0)
info_aXv.reset_index(inplace=True)
# Strip the leading "x" that was prepended to each arXiv id when the CSV
# was written (see the commented-out export code above)
info_aXv["idAxv"] = info_aXv["idAxv"].apply(lambda x: x[1:])
def draw(G, pos, measures, measure_name):
    """Draw graph G at positions pos, coloring nodes by a centrality measure.

    measures maps node -> centrality value; measure_name is used as the title.
    """
    plt.figure(figsize=(40, 30))
    node_collection = nx.draw_networkx_nodes(G, pos, node_size=90, cmap=plt.cm.plasma,
                                             node_color=list(measures.values()),
                                             nodelist=measures.keys())
    # Symmetric-log color scale so small centrality differences remain visible
    node_collection.set_norm(mcolors.SymLogNorm(linthresh=0.01, linscale=1, base=10))
    #labels = nx.draw_networkx_labels(G, pos)
    nx.draw_networkx_edges(G, pos)
    plt.title(measure_name)
    plt.colorbar(node_collection)
    plt.axis('off')
    plt.show()
def sorting(cetralts, n_tops):
    """Return the n_tops (node, value) pairs with the largest centrality values,
    in descending order of value."""
    ranked = sorted(cetralts.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:n_tops]
def infos(centrs_tops):
    """Print rank, arXiv id, centrality value, and paper title for each
    (id, value) pair, looking the title up in the global info_aXv table."""
    for top_i, (idd, val) in enumerate(centrs_tops, start=1):
        print("Top", str(top_i), ", Id arXiv:", idd, ", Centrality:", round(val, 4))
        print("Title:", info_aXv[info_aXv["idAxv"] == idd]["title"].values[0], "\n")
The most important centrality is PageRank, because this measure uncovers nodes whose influence extends beyond their direct connections into the wider network.
# Page rank centrality (alpha = 0.85 is the damping factor)
page_rank = nx.pagerank(GG, alpha = 0.85)
page_rank_tops = sorting(page_rank, 10)
infos(page_rank_tops)
#draw(GG, nx.spring_layout(GG), nx.pagerank(GG, alpha = 0.85), 'Page Rank Centrality')
# In degree centrality
in_deg = nx.in_degree_centrality(GG)
in_deg_tops = sorting(in_deg, 10)
infos(in_deg_tops)
#draw(GG, nx.spring_layout(GG), nx.in_degree_centrality(GG), 'In Degree Centrality')
# Out degree centrality
out_deg = nx.out_degree_centrality(GG)
out_deg_tops = sorting(out_deg, 10)
infos(out_deg_tops)
#draw(GG, nx.spring_layout(GG), nx.out_degree_centrality(GG), 'Out Degree Centrality')
# Closeness centrality
closeness = nx.closeness_centrality(GG)
closeness_tops = sorting(closeness, 10)
infos(closeness_tops)
#draw(GG, nx.spring_layout(GG), nx.closeness_centrality(GG), 'Closeness Centrality')
# Eigenvector centrality (max_iter raised to 600 to help convergence)
eig = nx.eigenvector_centrality(GG, max_iter=600)
eig_tops = sorting(eig, 10)
infos(eig_tops)
#draw(GG, nx.spring_layout(GG), nx.eigenvector_centrality(GG, max_iter=600), 'Eigenvector Centrality')
# Betweenness centrality
betw = nx.betweenness_centrality(GG)
betw_tops = sorting(betw, 10)
infos(betw_tops)
#draw(GG, nx.spring_layout(GG), nx.betweenness_centrality(GG), 'Betweeness Centrality')
# Katz centrality
# NOTE(review): alpha must be smaller than the reciprocal of the adjacency
# matrix's largest eigenvalue for Katz to converge — confirm 0.1 satisfies this
katz = nx.katz_centrality(GG, alpha=0.1, beta=1.0)
katz_tops = sorting(katz, 10)
infos(katz_tops)
#draw(GG, nx.spring_layout(GG), nx.katz_centrality(GG, alpha=0.1, beta=1.0), 'Katz Centrality')
# Hub and authority scores (HITS) — computed once and reused for both
# rankings; the original called nx.hits() twice with identical arguments,
# doubling an expensive iterative computation for no change in results.
hubs, authorities = nx.hits(GG, max_iter=600)
# Hub scores
hubs_tops = sorting(hubs, 10)
infos(hubs_tops)
#draw(GG, nx.spring_layout(GG), nx.hits(GG, max_iter=600)[0], 'Hub Scores')
# Authority scores (from the same HITS run)
authorities_tops = sorting(authorities, 10)
infos(authorities_tops)
#draw(GG, nx.spring_layout(GG), nx.hits(GG, max_iter=600)[1], 'Authority Scores')
# Harmonic centrality
# (sum of reciprocal shortest-path distances from all other nodes; well
# defined even on disconnected graphs)
harmonic = nx.harmonic_centrality(GG)
harmonic_tops = sorting(harmonic, 10)
infos(harmonic_tops)
#draw(GG, nx.spring_layout(GG), nx.harmonic_centrality(GG), 'Harmonic Centrality')
# Collecting the top-10 node ids of every centrality measure in a single dataframe
def _top_ids(tops):
    # tops is a list of (node, value) pairs in rank order; keep just the ids
    return [node for node, _ in tops]

data_tmp = {'page rank': _top_ids(page_rank_tops),
            'in degree': _top_ids(in_deg_tops),
            'out degree': _top_ids(out_deg_tops),
            'closeness': _top_ids(closeness_tops),
            'eigenvector': _top_ids(eig_tops),
            'betweeness': _top_ids(betw_tops),
            'katz': _top_ids(katz_tops),
            'hub scores': _top_ids(hubs_tops),
            'authority scores': _top_ids(authorities_tops),
            'harmonic': _top_ids(harmonic_tops)}
df_centralities = pd.DataFrame(data_tmp)
df_centralities
From the table above we can see that several papers (nodes) within the top 10 of PageRank centrality, such as 0809.1869, 1611.05079, 1309.3638, and 1605.02016, also appear in the top 10 of various other centrality measures. This is because these nodes are the most influential in the network, and therefore it is expected that they appear in several places in the table.
In addition, several of the Page Rank values seen in the table coincide with the Katz values, and this is because the Page Rank centrality measure is an improvement of the Katz centrality.
As mentioned above, from the definition of Page Rank centrality, it is concluded that it is the best option to find the most influential papers in arXiv.
# Filtering only old-style arXiv ids of the form "field/number": both
# endpoints must contain '/' and must not start with a digit.
# The `True if cond else False` lambdas are simplified to the boolean
# condition itself ('/' in x, not x[0].isnumeric()) — same result.
df_filt = df
df_filt = df_filt[df_filt["top"].apply(lambda x: '/' in x)]
df_filt = df_filt[df_filt["sub"].apply(lambda x: '/' in x)]
df_filt = df_filt[df_filt["top"].apply(lambda x: not x[0].isnumeric())]
df_filt = df_filt[df_filt["sub"].apply(lambda x: not x[0].isnumeric())]
print(df_filt.shape)
df_filt.head()
# Construct a dataframe of field names: the field is the prefix before '/'
# in an old-style arXiv id (e.g. "hep-th/9901001" -> "hep-th")
fields_main = df_filt["top"].apply(lambda s: s.split('/', 1)[0])
fields_refs = df_filt["sub"].apply(lambda s: s.split('/', 1)[0])
df_fields = pd.DataFrame({'top': fields_main, 'sub': fields_refs}).reset_index(drop=True)
print(df_fields.shape)
df_fields.head()
# Complete the number of citations between fields:
# group by (top, sub) and count how many citation edges connect each pair
df_fields = df_fields.groupby(df_fields.columns.tolist()).size().reset_index().rename(columns={0:'count'})
print(df_fields.shape)
df_fields.head()
# Filter count >= 3: keep only field pairs with at least 3 citations
df_fields = df_fields[df_fields["count"]>=3]
print(df_fields.shape)
df_fields.head()
# Filter fields that reference themselves (drop self-loops)
df_fields = df_fields[df_fields["top"]!=df_fields["sub"]]
print(df_fields.shape)
df_fields.head()
# Saving for use in Cytoscape
df_fields.to_csv("cytoscape/fields_network.csv", sep='\t', encoding='utf-8', index=False)
# Directed field-level graph; each edge carries the citation count as an attribute
GG_fields = nx.from_pandas_edgelist(df_fields, source="top",
target="sub", edge_attr="count",
create_using=nx.DiGraph())
# NOTE(review): nx.info() was removed in networkx 3.0 — on newer versions use print(GG_fields)
print(nx.info(GG_fields))
The size of each node is a function of its in-degree, and the width of each edge is a function of its 'count' value (the number of citations between the two fields).
spring_pos = nx.spring_layout(GG_fields) # might take a little while
fig = plt.figure(figsize = (40, 30))
ax = fig.add_subplot(111)
ax.axis('off')
# Node size scales with in-degree; edge width scales with the 'count' column.
# NOTE(review): the width list assumes df_fields rows are in the same order
# as the graph's edge list — confirm this holds for this networkx version.
nx.draw_networkx(GG_fields,
spring_pos,
ax = ax,
node_color='orange',
node_size=[v*1200 for v in dict(GG_fields.in_degree()).values()],
font_size=35,
with_labels=True,
arrowsize=60,
width=list(1+df_fields["count"].values/2),
connectionstyle="arc3,rad=0.2",
edge_color='steelblue')
plt.title("Entire graph - Default node size")
plt.close();
fig
The size of each node is a function of its in-degree, and the width of each edge is a function of its 'count' value (the number of citations between the two fields).
The Cytoscape file is located in the folder "cytoscape"